import warnings
warnings.filterwarnings('ignore')
# Data manipulation libraries
import numpy as np
import pandas as pd
# Visualization libraries
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
# Avoid Warnings
import warnings
warnings.filterwarnings('ignore')
# %matplotlib inline  # IPython magic from the notebook export; invalid in a plain .py file
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
print(df.shape)
df.head()

# Dropping the id column as it's just an identifier
df.drop(['id'], axis=1, inplace=True)

# Count of missing values per column
missing_values_count = df.isna().sum()

# Percentage of missing cells over the whole table
# (np.product was deprecated and removed in NumPy 2.0; np.prod is the supported name)
total_cells = np.prod(df.shape)
total_missing = missing_values_count.sum()
percent_missing = (total_missing / total_cells) * 100
print("Percentage of missing data from the dataset is : {}%".format(percent_missing))
# Plotting a heatmap to check for missing data features
plt.figure(figsize=(12, 6))
sns.heatmap(df.isnull())
plt.show()

# Filling the missing data in the bmi column with the column mean.
# Assign the result instead of using inplace=True on the column: inplace fillna
# on a selected Series is the chained-assignment pattern pandas deprecates.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df.info()
# Making different lists for categorical and continuous feature names
cat_cols = ["gender","hypertension","heart_disease","ever_married","work_type","Residence_type","smoking_status","stroke"]
cont_cols = ["age","avg_glucose_level","bmi"]

# 4x2 grid of count plots — one per categorical feature, in cat_cols order
# (replaces eight copy-pasted countplot calls with a single loop)
fig, axes = plt.subplots(4, 2, figsize=(16, 16))
sns.set_style('darkgrid')
fig.suptitle("Count plot for various categorical features")
for ax, col in zip(axes.ravel(), cat_cols):
    sns.countplot(ax=ax, data=df, x=col)
plt.show()
# One dark-themed box plot per continuous feature
# (replaces three identical copy-pasted plotting blocks)
for col in ("age", "avg_glucose_level", "bmi"):
    fig = px.box(data_frame=df,
                 x=col,
                 width=800,
                 height=300)
    fig.update_layout({"template": "plotly_dark"})
    fig.show()
# Distribution plot (histogram + KDE) for each continuous feature
# (replaces three copy-pasted blocks that differed only in the column name)
for col in ("age", "avg_glucose_level", "bmi"):
    hist_data = [list(df[col].values)]
    group_labels = [col]
    fig = ff.create_distplot(hist_data, group_labels, show_hist=True, colors=['Orange'])
    fig.update_layout({"template": "plotly_dark"})
    fig.show()
# Feature name lists (categorical vs continuous), re-declared for this section
cat_cols = ["gender","hypertension","heart_disease","ever_married","work_type","Residence_type","smoking_status","stroke"]
cont_cols = ["age","avg_glucose_level","bmi"]

# Pearson correlation among the continuous features, rendered as a heatmap
cr = df[cont_cols].corr(method='pearson')
fig_size = (6, 6)
plt.figure(figsize=fig_size)
sns.heatmap(cr, cmap="coolwarm")
plt.show()
# Pairwise scatter plots of the continuous features, colored by stroke label
# (replaces three copy-pasted plotting blocks with one loop)
for xc, yc in (("age", "avg_glucose_level"),
               ("avg_glucose_level", "bmi"),
               ("age", "bmi")):
    plt.figure(figsize=(8, 8))
    sns.set_style("darkgrid")
    sns.scatterplot(data=df, x=xc, y=yc, hue='stroke')
    plt.show()
# Violin plots: distribution of each continuous feature, split by stroke label
plt.figure(figsize=(16, 6))
for position, feature in enumerate(["age", "avg_glucose_level", "bmi"], start=1):
    plt.subplot(1, 3, position)
    sns.violinplot(x='stroke', y=feature, data=df)
plt.show()

# Pairplot of every feature pair, colored by stroke
plt.figure(figsize=(16, 16))
sns.pairplot(df, hue='stroke')
plt.show()
df["gender"].value_counts()

# Drop the rare 'Other' gender row(s) so gender is binary
df.drop(df[df['gender'] == 'Other'].index, inplace=True)
df["gender"].value_counts()

# Class balance: 0 = no stroke, 1 = stroke
print("The number of people who don't have stroke : ", df['stroke'].value_counts()[0])
# BUG FIX: this line previously repeated "don't have stroke" for the positive class
print("The number of people who have stroke : ", df['stroke'].value_counts()[1])

# High-glucose outliers among stroke-positive rows
cond1 = df['avg_glucose_level'] > 170
cond2 = df['stroke'] == 1
# .shape[0] is the row count; printing .shape showed a (rows, cols) tuple instead
print("The number of outliers in avg_glucose_level with stroke = 1 are : ", df[cond1 & cond2].shape[0])
cond3 = df['bmi'] > 47
cond4 = df['stroke'] == 1
print("The number of outliers in bmi with stroke = 1 are : ", df[cond3 & cond4].shape[0])

# Remove extreme BMI outliers (bmi > 47)
print("The shape before removing the BMI outliers : ", df.shape)
df.drop(df[df['bmi'] > 47].index, inplace=True)
print("The shape after removing the BMI outliers : ", df.shape)
# BMI distribution after outlier removal.
# sns.distplot was deprecated and removed in recent seaborn releases;
# histplot(kde=True) is the documented replacement.
plt.figure(figsize=(14, 5))
sns.histplot(x=df['bmi'], kde=True, color='red')
plt.show()
df.dtypes

# Label-encode the string-valued categorical columns in place.
from sklearn.preprocessing import LabelEncoder
object_cols = ["gender","ever_married","work_type","Residence_type","smoking_status"]
label_encoder = LabelEncoder()
for col in object_cols:
    # fit_transform == fit on the column followed by transform of the same column
    df[col] = label_encoder.fit_transform(df[col])
# Using SMOTE to oversample the minority (stroke = 1) class up to parity
from imblearn.over_sampling import SMOTE
sampler = SMOTE(random_state = 42)
X = df.drop(['stroke'],axis=1)
y = df[['stroke']]
# fit_resample returns the oversampled feature frame and a flat label array
X,y= sampler.fit_resample(X,y['stroke'].values.ravel())
y = pd.DataFrame({'stroke':y})
# After resampling the count plot should show two equal bars
sns.countplot(data = y, x = 'stroke', y= None)
plt.show()
# Joining back dataset (features + target into one frame)
df = pd.concat([X,y],axis = 1)
df.head()
# shuffling the dataset before model development
df = df.sample(frac = 1)
import torch
import torch.nn as nn
# Column groups for the tabular PyTorch model ('stroke' is the target)
cat_cols = ["gender","hypertension","heart_disease","ever_married","work_type","Residence_type","smoking_status"]
cont_cols = ["age","avg_glucose_level","bmi"]
y_col = ["stroke"]
# Convert each categorical column to pandas 'category' dtype so .cat.codes works
for cat in cat_cols:
    df[cat] = df[cat].astype('category')
df.dtypes
# stacking the categorical columns into one (n_rows, n_cat_cols) integer array
cats = np.stack([df[col].cat.codes.values for col in cat_cols], 1)
cats[:5]
# converting the stack into tensor (int64, the index dtype nn.Embedding expects)
cats = torch.tensor(cats, dtype = torch.int64)
cats[:5]
# stacking the continuous columns & converting to tensor
conts = np.stack([df[col].values for col in cont_cols], 1)
conts = torch.tensor(conts, dtype=torch.float)
conts[:5]
# converting target variable to tensor and flattening since CrossEntropyLoss expects a 1-d tensor
y = torch.tensor(df[y_col].values).flatten()
y[:5]
print(cats.shape)
print(conts.shape)
print(y.shape)
# Embedding size per categorical column: (cardinality, min(50, (cardinality+1)//2))
cat_szs = [len(df[col].cat.categories) for col in cat_cols]
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]
emb_szs
class HDTL(nn.Module):
    """Tabular network: one embedding table per categorical column,
    batch-normalized continuous inputs, and a configurable MLP head.

    emb_szs : list of (cardinality, embedding_dim) pairs, one per categorical column
    n_cont  : number of continuous input features
    out_sz  : number of output units (class logits)
    layers  : hidden-layer widths for the MLP head
    p       : dropout probability used after the embeddings and in each hidden block
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # First linear layer sees concatenated embeddings + continuous features
        n_in = sum(nf for _, nf in emb_szs) + n_cont
        layerlist = []
        for width in layers:
            # Each hidden block: Linear -> ReLU -> BatchNorm -> Dropout
            layerlist += [
                nn.Linear(n_in, width),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(width),
                nn.Dropout(p),
            ]
            n_in = width
        layerlist.append(nn.Linear(layers[-1], out_sz))
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        # Look up each categorical column in its own embedding table,
        # then concatenate the embeddings along the feature axis
        emb = torch.cat([e(x_cat[:, i]) for i, e in enumerate(self.embeds)], 1)
        emb = self.emb_drop(emb)
        x = torch.cat([emb, self.bn_cont(x_cont)], 1)
        return self.layers(x)
torch.manual_seed(42)
model = HDTL(emb_szs, conts.shape[1], 2, [400, 200, 100], p=0.2)
model

# "Bayesian Optimization" (section title in the original notebook).
# NOTE(review): this is actually a plain Adam training run; no Bayesian
# hyperparameter search is performed anywhere in this section — confirm intent.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Simple positional holdout on the (already shuffled) tensors:
# first 8508 rows train, next 492 rows test
batch_size = 9000
test_size = 492
cat_train = cats[:batch_size - test_size]
cat_test = cats[batch_size - test_size:batch_size]
con_train = conts[:batch_size - test_size]
con_test = conts[batch_size - test_size:batch_size]
y_train = y[:batch_size - test_size]
y_test = y[batch_size - test_size:batch_size]
print(len(cat_train))
print(len(cat_test))
import time
start_time = time.time()

# Full-batch training with Adam
epochs = 320
losses = []
for i in range(epochs):
    i += 1
    y_pred = model(cat_train, con_train)
    loss = criterion(y_pred, y_train)
    # BUG FIX: append the scalar, not the tensor — storing the loss tensor
    # kept every epoch's autograd graph alive and breaks plt.plot on newer torch
    losses.append(loss.item())
    if i % 25 == 1:
        print(f'epoch: {i:3} loss: {loss.item():10.8f}')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3} loss: {loss.item():10.8f}')
print(f'\nDuration: {time.time() - start_time:.0f} seconds')

# Training curve
plt.plot(range(epochs), losses)
plt.ylabel('Cross Entropy Loss')
plt.xlabel('epoch');
# TO EVALUATE THE ENTIRE TEST SET
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = criterion(y_val, y_test)
print(f'CE Loss: {loss:.8f}')

# Per-row comparison of predicted class (argmax of logits) vs ground truth
rows = 200
correct = 0
groundTruth = []
predictedValues = []
print(f'{"MODEL OUTPUT":26} ARGMAX Y_TEST')
for i in range(rows):
    print(f'{str(y_val[i]):26} {y_val[i].argmax():^7}{y_test[i]:^7}')
    predictedValues.append(y_val[i].argmax().item())
    groundTruth.append(y_test[i].item())
    if y_val[i].argmax().item() == y_test[i]:
        correct += 1
print(f'\n{correct} out of {rows} = {100*correct/rows:.2f}% correct')

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
hdtl_f1 = f1_score(groundTruth, predictedValues)
hdtl_rec = recall_score(groundTruth, predictedValues)
hdtl_prec = precision_score(groundTruth, predictedValues)
hdtl_acc = accuracy_score(groundTruth, predictedValues)

from sklearn import metrics
# BUG FIX: the labels are 0/1, so the positive class is 1; pos_label=2 matched
# no samples and produced a meaningless ROC curve / AUC
fpr, tpr, thresholds = metrics.roc_curve(groundTruth, predictedValues, pos_label=1)
hdtl_auc = metrics.auc(fpr, tpr)
# SGD Optimization (section title in the original notebook).
# NOTE(review): this continues training the SAME model instance already trained
# with Adam above; if an independent SGD baseline was intended, the model
# should be re-initialized first — confirm.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

import time
start_time = time.time()

epochs = 320
losses = []
for i in range(epochs):
    i += 1
    y_pred = model(cat_train, con_train)
    loss = criterion(y_pred, y_train)
    # BUG FIX: store the scalar, not the tensor (avoids retaining autograd
    # graphs and lets plt.plot consume the list directly)
    losses.append(loss.item())
    if i % 25 == 1:
        print(f'epoch: {i:3} loss: {loss.item():10.8f}')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3} loss: {loss.item():10.8f}')
print(f'\nDuration: {time.time() - start_time:.0f} seconds')

plt.plot(range(epochs), losses)
plt.ylabel('Cross Entropy Loss')
plt.xlabel('epoch');
# TO EVALUATE THE ENTIRE TEST SET
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = criterion(y_val, y_test)
print(f'CE Loss: {loss:.8f}')

# Per-row comparison of predicted class (argmax of logits) vs ground truth
rows = 200
correct = 0
groundTruth = []
predictedValues = []
print(f'{"MODEL OUTPUT":26} ARGMAX Y_TEST')
for i in range(rows):
    print(f'{str(y_val[i]):26} {y_val[i].argmax():^7}{y_test[i]:^7}')
    predictedValues.append(y_val[i].argmax().item())
    groundTruth.append(y_test[i].item())
    if y_val[i].argmax().item() == y_test[i]:
        correct += 1
print(f'\n{correct} out of {rows} = {100*correct/rows:.2f}% correct')

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
hdtl1_f1 = f1_score(groundTruth, predictedValues)
hdtl1_rec = recall_score(groundTruth, predictedValues)
hdtl1_prec = precision_score(groundTruth, predictedValues)
hdtl1_acc = accuracy_score(groundTruth, predictedValues)

from sklearn import metrics
# BUG FIX: labels are 0/1, so pos_label must be 1 (pos_label=2 matched no
# samples and produced a meaningless ROC/AUC)
fpr, tpr, thresholds = metrics.roc_curve(groundTruth, predictedValues, pos_label=1)
hdtl1_auc = metrics.auc(fpr, tpr)
# For DNN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import Normalizer
# Use tensorflow.keras consistently: mixing `keras` and `tensorflow.keras`
# objects in one model can fail depending on the installed versions
from tensorflow.keras.layers import Activation, Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
# %matplotlib inline  # IPython magic from the notebook export; invalid in a .py file
plt.style.use('fivethirtyeight')

# Reload the raw dataset for the Keras models (note: NOT the SMOTE-balanced frame)
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.drop(['id'], axis=1, inplace=True)
df.head()
# Assign instead of inplace fillna on the column (deprecated chained assignment)
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df.info()
from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
# One encoder instance is reused; fit_transform refits it for each column.
label_encoder = preprocessing.LabelEncoder()
for column in ('gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'):
    df[column] = label_encoder.fit_transform(df[column])

# Inspect the encoded value set of each column
df['gender'].unique()
df['ever_married'].unique()
df['work_type'].unique()
df['Residence_type'].unique()
df['smoking_status'].unique()
# Feature matrix / target vector for the Keras DNN
X = df.drop('stroke', axis =1).values
y = df.stroke.values
# 70/30 split; the 30% holdout is halved into dev and test below (70/15/15)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
# Normalizer rescales each ROW (sample) to unit norm — per-sample, not per-feature
nl = Normalizer()
nl.fit(X_train)
X_train = nl.transform(X_train)
X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=2)
X_dev = nl.transform(X_dev)
X_test = nl.transform(X_test)
def dnn():
    """Build the feed-forward stroke classifier.

    Architecture: three 128-unit blocks (Dense -> BatchNorm -> ReLU -> Dropout)
    followed by a single sigmoid output unit.

    NOTE(review): the third block has no activation between BC3 and Dropout3 —
    looks unintentional, but it is preserved here as-is.
    """
    inputs = Input(name='inputs', shape=[X_train.shape[1],])

    x = Dense(128, name='FC1')(inputs)
    x = BatchNormalization(name='BC1')(x)
    x = Activation('relu', name='Activation1')(x)
    x = Dropout(0.3, name='Dropout1')(x)

    x = Dense(128, name='FC2')(x)
    x = BatchNormalization(name='BC2')(x)
    x = Activation('relu', name='Activation2')(x)
    x = Dropout(0.3, name='Dropout2')(x)

    x = Dense(128, name='FC3')(x)
    x = BatchNormalization(name='BC3')(x)
    x = Dropout(0.3, name='Dropout3')(x)

    x = Dense(1, name='OutLayer')(x)
    x = Activation('sigmoid', name='sigmoid')(x)
    return Model(inputs=inputs, outputs=x)
model = dnn()
model.summary()

# "Bayesian Optimization" (section title in the original notebook).
# NOTE(review): this is a plain Adam run with LR-on-plateau and early stopping,
# not a Bayesian hyperparameter search — confirm the intended label.
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
reduce_lr = ReduceLROnPlateau()
early_stopping = EarlyStopping(patience=20, min_delta=0.0001)
model.fit(x=X_train, y=y_train, epochs=200, validation_data=(X_dev, y_dev),
          callbacks=[reduce_lr, early_stopping], verbose=0)

# Score train / dev / test in turn; the dnn_* variables end up holding the
# values from the LAST iteration, i.e. the test split
x_lst = [X_train, X_dev, X_test]
y_lst = [y_train, y_dev, y_test]
for i, (x, y) in enumerate(zip(x_lst, y_lst)):
    y_pred = model.predict(x)
    y_pred = np.around(y_pred)          # threshold sigmoid outputs at 0.5
    y_pred = np.asarray(y_pred)
    dnn_acc = accuracy_score(y, y_pred)
    dnn_f1 = f1_score(y, y_pred)
    dnn_rec = recall_score(y, y_pred)
    dnn_prec = precision_score(y, y_pred)
    # BUG FIX: labels are 0/1, so pos_label must be 1 (2 matched no samples)
    fpr, tpr, thresholds = metrics.roc_curve(y, y_pred, pos_label=1)
    dnn_auc = metrics.auc(fpr, tpr)
# BUG FIX: `from keras.utils.np_utils import to_categorical` was removed here —
# that import path no longer exists in modern Keras (ImportError) and the
# tensorflow.keras import below provides the same function anyway.
from sklearn.utils import class_weight
from sklearn.metrics import log_loss
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

# Features/target from the label-encoded (imbalanced) dataframe
X = df.drop('stroke', axis=1)
y = df.stroke
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
X_train = X_train.values
X_test = X_test.values

# Conv1D expects (samples, timesteps, channels): add a trailing channel axis
X_train = X_train.reshape(-1, X_train.shape[1], 1)
X_test = X_test.reshape(-1, X_test.shape[1], 1)

# One-hot targets for the 2-unit output layer
Y_train = to_categorical(y_train)
Y_test = to_categorical(y_test)
def showResults(test, pred):
    """Print accuracy, weighted precision, weighted F1 and the confusion
    matrix for ground-truth labels `test` and predictions `pred`.

    NOTE(review): relies on `confusion_matrix`, which is only imported further
    down this file — calling this before that import runs raises NameError.
    """
    #target_names = ['positive', 'negative']
    # print(classification_report(test, pred, target_names=target_names))
    accuracy = accuracy_score(test, pred)
    precision=precision_score(test, pred, average='weighted')
    f1Score=f1_score(test, pred, average='weighted')
    #loss=log_loss(test,pred)
    print("Accuracy : {}".format(accuracy))
    print("Precision : {}".format(precision))
    print("f1Score : {}".format(f1Score))
    #print("Loss : {}".format(loss))
    cm=confusion_matrix(test, pred)
    print(cm)
# SGD Optimization — Conv1D + LSTM classifier trained with SGD
import tensorflow as tf
tf.keras.backend.clear_session()
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv1D(filters=64, kernel_size=5, strides=1, padding="causal",
                           activation="relu",
                           input_shape=(X_train.shape[1], X_train.shape[2])),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=1, padding="valid"),
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, strides=1, padding="causal",
                           activation="relu"),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=1, padding="valid"),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(2)   # raw logits for the two classes (no softmax)
])

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(5e-4,
                                                             decay_steps=1000000,
                                                             decay_rate=0.98,
                                                             staircase=False)

# BUG FIX: targets are one-hot class labels and the head outputs logits, so
# cross-entropy is the correct loss; MeanSquaredError is a regression loss here.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.8),
              metrics=['acc'])
model.summary()

# NOTE(review): validation_steps has no effect without validation_data —
# confirm whether a validation split was intended here.
history = model.fit(X_train, Y_train, epochs=100, steps_per_epoch=200, validation_steps=200)
# Plot training loss values (history contains only training curves —
# no validation_data was passed to fit)
plt.plot(history.history['loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# BUG FIX: only one curve is plotted; the legend previously mislabeled the
# single training curve as Train/Validation
plt.legend(['Train'], loc='upper left')
plt.savefig('loss.png', format='png', dpi=1200)
plt.show()

# Plot training accuracy values
plt.plot(history.history['acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train'], loc='upper left')
plt.savefig('accuracy.png', format='png', dpi=1200)
plt.show()
# Evaluate the CNN+LSTM model on the held-out test split
predictions = model.predict(X_test, verbose=1)
predictcv = np.argmax(predictions, axis=1)      # predicted class per row
actual_valuecv = np.argmax(Y_test, axis=1)      # true class from one-hot targets
ens_acc = accuracy_score(actual_valuecv, predictcv)
ens_f1 = f1_score(actual_valuecv, predictcv)
ens_rec = recall_score(actual_valuecv, predictcv)
ens_prec = precision_score(actual_valuecv, predictcv)
# BUG FIX: labels are 0/1, so pos_label must be 1 (2 matched no samples)
fpr, tpr, thresholds = metrics.roc_curve(actual_valuecv, predictcv, pos_label=1)
ens_auc = metrics.auc(fpr, tpr)
X
y

# Support Vector Machine baseline.
# NOTE(review): this model is trained AND evaluated on the same data (no
# holdout), so the scores measure training fit, not generalization.
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
SVM = SVC()
SVM.fit(X, y)
predictions = SVM.predict(X)
val1 = (accuracy_score(y, predictions)*100)
print("*Accuracy score for SVM: ", val1, "\n")
print("*Confusion Matrix for SVM: ")
print(confusion_matrix(y, predictions))
print("*Classification Report for SVM: ")
print(classification_report(y, predictions))
svm_f1 = f1_score(y, predictions)
svm_rec = recall_score(y, predictions)
svm_prec = precision_score(y, predictions)
# BUG FIX: labels are 0/1, so pos_label must be 1 (2 matched no samples)
fpr, tpr, thresholds = metrics.roc_curve(y, predictions, pos_label=1)
svm_auc = metrics.auc(fpr, tpr)
# Decision Tree baseline (trained and scored on the same data — see SVM note)
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(max_depth=3, random_state=42)
DT.fit(X, y)
predictions = DT.predict(X)
val2 = (accuracy_score(y, predictions)*100)
print("*Accuracy score for DT: ", val2, "\n")
print("*Confusion Matrix for DT: ")
print(confusion_matrix(y, predictions))
print("*Classification Report for DT: ")
print(classification_report(y, predictions))
dt_f1 = f1_score(y, predictions)
dt_rec = recall_score(y, predictions)
dt_prec = precision_score(y, predictions)
# BUG FIX: labels are 0/1, so pos_label must be 1 (2 matched no samples)
fpr, tpr, thresholds = metrics.roc_curve(y, predictions, pos_label=1)
dt_auc = metrics.auc(fpr, tpr)
# Random Forest baseline (trained and scored on the same data — see SVM note)
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(X, y)
predictions = RF.predict(X)
val3 = (accuracy_score(y, predictions)*100)
print("*Accuracy score for RF: ", val3, "\n")
print("*Confusion Matrix for RF: ")
print(confusion_matrix(y, predictions))
print("*Classification Report for RF: ")
print(classification_report(y, predictions))
rf_f1 = f1_score(y, predictions)
rf_rec = recall_score(y, predictions)
rf_prec = precision_score(y, predictions)
# BUG FIX: labels are 0/1, so pos_label must be 1 (2 matched no samples)
fpr, tpr, thresholds = metrics.roc_curve(y, predictions, pos_label=1)
rf_auc = metrics.auc(fpr, tpr)
# Hard-voting ensemble of NB + RF + DT (trained and scored on the same data)
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = GaussianNB()
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = DecisionTreeClassifier()
eclf1 = VotingClassifier(estimators=[('nb', clf1), ('rf', clf2), ('dt', clf3)], voting='hard')
eclf1.fit(X, y)
predictions = eclf1.predict(X)
print("*Confusion Matrix for Voting Classifier: ")
print(confusion_matrix(y, predictions))
val4 = (accuracy_score(y, predictions)*100)
print("*Accuracy score for Voting: ", val4, "\n")
print("*Classification Report for Voting: ")
print(classification_report(y, predictions))
vot_f1 = f1_score(y, predictions)
vot_rec = recall_score(y, predictions)
vot_prec = precision_score(y, predictions)
# BUG FIX: labels are 0/1, so pos_label must be 1 (2 matched no samples)
fpr, tpr, thresholds = metrics.roc_curve(y, predictions, pos_label=1)
vot_auc = metrics.auc(fpr, tpr)
# Accuracy comparison across all models (values in percent)
score = [val1, val2, val3, val4, ens_acc*100, dnn_acc*100, hdtl_acc*100, hdtl1_acc*100]
#make variabel for save the result and to show it
classifier = ('SVM', 'DT', 'RF', 'Voting', 'CNN+LSTM-RO', 'DNN-BO', 'HDTL-BO', 'HDTL-RO')
y_pos = np.arange(len(classifier))
print(y_pos)
print(score)

# FIX: matplotlib.pyplot was already imported as plt at the top of the file;
# re-importing it under a second alias (plt2) referenced the same module
plt.barh(y_pos, score, align='center', alpha=0.5, color='blue')
plt.yticks(y_pos, classifier)
plt.xlabel('Score')
plt.title('Classification Performance')
plt.show()
# Grouped bar chart: precision / recall / F1 (percent) for every model
N = 8
ind = np.arange(N)   # the x locations for the model groups
width = 0.2          # the width of each bar

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = [svm_prec*100, dt_prec*100, rf_prec*100, vot_prec*100,
         ens_prec*100, dnn_prec*100, hdtl_prec*100, hdtl1_prec*100]
rects1 = ax.bar(ind, yvals, width, color='r')

zvals = [svm_rec*100, dt_rec*100, rf_rec*100, vot_rec*100,
         ens_rec*100, dnn_rec*100, hdtl_rec*100, hdtl1_rec*100]
rects2 = ax.bar(ind + width, zvals, width, color='g')

kvals = [svm_f1*100, dt_f1*100, rf_f1*100, vot_f1*100,
         ens_f1*100, dnn_f1*100, hdtl_f1*100, hdtl1_f1*100]
rects3 = ax.bar(ind + width*2, kvals, width, color='b')

ax.set_ylabel('Scores')
ax.set_xticks(ind + width)
ax.set_xticklabels(('SVM', 'DT', 'RF', 'Voting', 'CNN+LSTM-RO', 'DNN-BO', 'HDTL-BO', 'HDTL-RO'))
ax.legend((rects1[0], rects2[0], rects3[0]), ('Precision', 'Recall', 'F1-SCore'))

def autolabel(rects):
    # Write each bar's integer height just above the bar
    for rect in rects:
        h = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 1.05*h, '%d' % int(h),
                ha='center', va='bottom')

for bar_group in (rects1, rects2, rects3):
    autolabel(bar_group)
plt.show()
# Persist the trained voting ensemble to disk for later reuse
import joblib
filename = 'model.sav'
joblib.dump(eclf1, filename)